## ----------------------------------------------------------
## Code to clean Kilosa study data
## ----------------------------------------------------------

#### Load data ####

# All raw files are read identically: the first row supplies the column
# names and text columns are kept as character vectors (not factors).
.read_kilosa_csv <- function(path) {
  read.csv(path, header = TRUE, stringsAsFactors = FALSE)
}

## Individual level data

# One shot survey data
studyKil <- .read_kilosa_csv("../data/kilosa/SurveyMain.csv")

# Asked question in classroom teacher mtg
KilCTplacard <- .read_kilosa_csv("../data/kilosa/ClassroomTeacherPlacardRaised.csv")

# Met with HT
KilHTplacard <- .read_kilosa_csv("../data/kilosa/HTPlacardGroup.csv")

## School and Meeting level data

# School covars
KilSchoolMain <- .read_kilosa_csv("../data/kilosa/SchoolMain.csv")

# School demographics by grade
KilSchoolGrades <- .read_kilosa_csv("../data/kilosa/SchoolGrades.csv")

# VP meetings
KilSessions <- .read_kilosa_csv("../data/kilosa/VPReportSession.csv")

# Meetings with classroom teachers (not optional)
KilCTmeetings <- .read_kilosa_csv("../data/kilosa/ClassroomTeacherMain.csv")

# Meetings with head teachers (optional)
KilHTmeetings <- .read_kilosa_csv("../data/kilosa/HTmeetingMain.csv")


#### Clean data ####

## Individual level data

# Survey times
# start_time / end_time are parsed as day, abbreviated month name, 4-digit
# year, whitespace, then HH:MM:SS. Values that do not fit the pattern
# become NA.
studyKil$start <- strptime(studyKil$start_time, format="%d%b%Y  %H:%M:%S")
studyKil$end <- strptime(studyKil$end_time, format="%d%b%Y  %H:%M:%S")

# Survey duration in minutes. The unit is requested explicitly: plain
# `end - start` lets difftime choose secs/mins/hours/days from the data, so
# the number stored in `minutes` could silently be in a different unit.
studyKil$minutes <- as.numeric(difftime(studyKil$end, studyKil$start,
                                        units = "mins"))

# Session morning or afternoon
# 1 when SessionID is exactly "Morning Session", 0 for any other value,
# NA when SessionID is missing.
studyKil$morning <- ifelse(studyKil$SessionID == "Morning Session", 1, 0)

# Treatments
# TreatCat is a factor with levels in the order SO, IW, VP. A ClassID that
# is missing or is not one of the three arms listed below becomes NA.
# Built with base factor(levels =, labels =) so this script does not rely on
# dplyr being attached elsewhere for case_when().
studyKil$TreatCat <- factor(studyKil$ClassID,
                            levels = c("Survey Only",
                                       "Information Workshop",
                                       "Validated Participation"),
                            labels = c("SO",
                                       "IW",
                                       "VP"))

studyKil$VP <- ifelse(studyKil$ClassID == "Validated Participation", 1, 0) #indicator for VP

studyKil$IW <- ifelse(studyKil$ClassID == "Information Workshop", 1, 0) #indicator for IW

# %in% never returns NA, so a missing ClassID is coded 0 here (unlike VP and
# IW above, where it stays NA).
studyKil$Mtg <- ifelse(studyKil$ClassID %in% c("Validated Participation",
                                               "Information Workshop"), 1, 0) #indicator for VP or IW

# Copy of schoolID under the capitalised name SchoolID.
studyKil$SchoolID <- studyKil$schoolID

# Covariates
studyKil$Female <- ifelse(studyKil$gender_conf == "Female", 1, 0)

studyKil$HHhead <- ifelse(studyKil$hoh == "Yes", 1, 0)

# Ages recorded as -99 are replaced by 2018 minus the reported birth year.
# which() keeps only the rows where the comparison is TRUE, so a missing age
# never reaches the subscripted assignment (a logical index containing NA
# makes `x[i] <- values` fail when more than one value is supplied).
# NOTE(review): 2018 is presumably the survey year -- confirm.
age_unknown <- which(studyKil$age == -99)
studyKil$age[age_unknown] <- 2018 - studyKil$year_born[age_unknown]

# Age bands, closed on the right:
# 1 = up to 30, 2 = (30, 40], 3 = (40, 50], 4 = (50, 60], 5 = over 60.
# A missing age stays NA.
studyKil$AgeBin <- as.numeric(cut(studyKil$age,
                                  breaks = c(-Inf, 30, 40, 50, 60, Inf),
                                  labels = FALSE))

# Main language: where the respondent chose "Other: (specify)", use the
# written-in answer instead.
studyKil$Language <- studyKil$main_lang

lang_written_in <- which(studyKil$main_lang == "Other: (specify)")
studyKil$Language[lang_written_in] <- studyKil$main_lang_other[lang_written_in]

# Collapsed language: the four named languages plus "Other". Because %in%
# is FALSE for NA, a missing language is also placed in "Other".
studyKil$Language2 <- ifelse(studyKil$Language %in% c("Kiswahili", "Kaguru", "Sukuma", "Luguru"),
                             studyKil$Language,
                             "Other")

studyKil$Language2 <- factor(studyKil$Language2,
                             levels = c("Kiswahili", "Kaguru", "Sukuma", "Luguru", "Other"))

studyKil$Kiswahili <- ifelse(studyKil$Language == "Kiswahili", 1, 0)

studyKil$Children <- studyKil$u12_cared

# Child's grade as a number (2-5); any other recorded grade becomes NA.
grade_labels <- c("Standard 2", "Standard 3", "Standard 4", "Standard 5")
studyKil$Standard <- c(2, 3, 4, 5)[match(studyKil$child_grade, grade_labels)]

# 1 = respondent is the child's parent, 0 = any other stated relationship,
# NA = relationship missing (the comparison itself yields NA).
studyKil$Parent <- ifelse(studyKil$child_relat == "Parent", 1, 0)

# Asset index: for each respondent, the number of the ten asset items below
# that were answered 'Yes'. Missing answers are not counted.
asset_items <- c("asset_metal_roof", "asset_cement_floor",
                 "asset_electricity", "asset_water",
                 "asset_bike", "asset_car_moto_baj",
                 "asset_radio", "asset_mb_phone",
                 "asset_internet", "asset_tel")

studyKil$econindexSUM <- as.integer(
  rowSums(studyKil[, asset_items] == 'Yes', na.rm = TRUE)
)

# Same index linearly mapped onto the range 1 to 5.
studyKil$econindexSUM_rescaled <- scales::rescale(studyKil$econindexSUM,
                                                  to = c(1, 5))

# Poor = 1 when fewer than 4 assets are reported, 0 when 4 or more.
studyKil$Poor <- ifelse(studyKil$econindexSUM < 4, 1, 0)

# Education as a factor whose levels run from least to most schooling; an
# edu_level value not listed here becomes NA.
edu_levels <- c("No schooling", "Some primary school",
                "Completed primary school", "Some secondary school",
                "Completed secondary school", "Some post-secondary school",
                "Diploma course/certificate", "University degree")

studyKil$Education <- factor(studyKil$edu_level, levels = edu_levels)

# Position of the level (1 = "No schooling" ... 8 = "University degree").
studyKil$Education_num <- as.numeric(studyKil$Education)

# Level position linearly mapped onto the range 1 to 5.
studyKil$Education_num_rescaled <- scales::rescale(studyKil$Education_num,
                                                   to = c(1, 5))

# Ethnicity: where the respondent chose "Other: (specify)", use the
# written-in answer instead. which() drops rows where tribe is missing, so
# the subscripted assignment cannot fail on an NA index.
studyKil$Ethnicity <- studyKil$tribe

tribe_written_in <- which(studyKil$tribe == "Other: (specify)")
studyKil$Ethnicity[tribe_written_in] <- studyKil$tribe_other[tribe_written_in]

# Collapsed ethnicity: the four named groups plus "Other". Because %in% is
# FALSE for NA, a missing ethnicity is also placed in "Other".
studyKil$Ethnicity2 <- ifelse(studyKil$Ethnicity %in% c("Wakaguru", "Waluguru", "Wasagara", "Wapogoro"),
                              studyKil$Ethnicity,
                              "Other")

studyKil$Ethnicity2 <- factor(studyKil$Ethnicity2,
                              levels = c("Wakaguru", "Waluguru", "Wasagara", "Wapogoro", "Other"))

studyKil$Wakaguru <- ifelse(studyKil$Ethnicity == "Wakaguru", 1, 0)

# Yes/No items: "Yes" -> 1, "No" -> 0, anything else (or missing) -> NA.
studyKil$Literate <- ifelse(studyKil$can_read == "Yes", 1,
                            ifelse(studyKil$can_read == "No", 0, NA))

# 1 when religion is exactly "Islam", 0 for any other stated religion.
studyKil$Muslim <- ifelse(studyKil$religion == "Islam", 1, 0)

studyKil$Political <- ifelse(studyKil$politics == "Yes", 1,
                             ifelse(studyKil$politics == "No", 0, NA))

# School knowledge
# Each new indicator (left) is derived from one Yes/No survey column (right):
# "Yes" is coded 1, "No" is coded 0, and every other answer, including a
# missing one, is coded NA.
school_items <- c(TeacherName    = "know_teacher",
                  AskAbsent      = "ask_absent",
                  AskHW          = "ask_hw",
                  SpeakPerform   = "speak_perform",
                  ChildBreakfast = "child_bfast",
                  SCmember       = "cmt_member")

studyKil[names(school_items)] <- unname(lapply(
  studyKil[unname(school_items)],
  function(answer) {
    ifelse(answer == "Yes", 1,
           ifelse(answer == "No", 0, NA))
  }
))

# Outcomes

# Convert every column of a data frame with as.numeric(); entries that are
# not numbers become NA.
.as_numeric_frame <- function(items) {
  data.frame(lapply(items, as.numeric), stringsAsFactors=FALSE)
}

# Replace the four verbal GEF answers by the digits "4".."1", then convert
# every column to numeric.
.gef_to_numeric <- function(answers) {
  answer_codes <- c("Completely true" = "4",
                    "Mostly true"     = "3",
                    "Somewhat true"   = "2",
                    "Not at all true" = "1")
  for (label in names(answer_codes)) {
    answers[answers == label] <- answer_codes[[label]]
  }
  .as_numeric_frame(answers)
}

## GEF items
GEF_qs <- c("GEF1", "GEF2", "GEF3", "GEF4", "GEF5",
            "GEF6", "GEF7", "GEF8", "GEF9", "GEF10")

# All respondents
GEF <- .gef_to_numeric(studyKil[, GEF_qs])

# Survey Only respondents
GEF.SO <- .gef_to_numeric(studyKil[studyKil$ClassID == "Survey Only", GEF_qs])

# Sum of the ten items. With na.rm = TRUE an unanswered item adds nothing to
# the sum, so a respondent with no answers at all scores 0.
studyKil$GEFscore <- rowSums(GEF, na.rm = TRUE)

## EEF items
EEF_qs <- c("EEF1", "EEF2", "EEF3", "EEF4",
            "EEF5", "EEF6", "EEF7", "EEF8", "EEF9")

# All respondents
EEF <- .as_numeric_frame(studyKil[, EEF_qs])

# Survey Only respondents
EEF.SO <- .as_numeric_frame(studyKil[studyKil$ClassID == "Survey Only", EEF_qs])

# Store the numeric versions back in the main data set.
studyKil[, EEF_qs] <- EEF

# Scores are row means over the answered items of each item set.
studyKil$EEFscore <- rowMeans(EEF, na.rm = TRUE)                           # total score
studyKil$EEFinternal <- rowMeans(EEF[, EEF_qs[1:4]], na.rm = TRUE)         # internal efficacy
studyKil$EEFexternal <- rowMeans(EEF[, EEF_qs[5:9]], na.rm = TRUE)         # external efficacy
studyKil$EEFindiv <- rowMeans(EEF[, EEF_qs[c(2,3,4,6,7)]], na.rm = TRUE)   # individual efficacy
studyKil$EEFgroup <- rowMeans(EEF[, EEF_qs[c(1,5,8,9)]], na.rm = TRUE)     # group related efficacy

# Plain copies of survey columns under descriptive names.
studyKil$PubGoodMtg <- studyKil$com_01
studyKil$PubGoodDirect <- studyKil$com_02

studyKil$RespectedbyParents <- studyKil$hon01
studyKil$RespectedbyAuth <- studyKil$hon02
studyKil$RespectedIdeas <- studyKil$hon03

# Code each answer by the first pattern (in the order given) that occurs
# anywhere in its text; answers containing none of the patterns, and missing
# answers, are coded NA. Built from the last pattern backwards so that
# earlier patterns take precedence.
.code_first_match <- function(answer, patterns, codes) {
  coded <- NA
  for (k in rev(seq_along(patterns))) {
    coded <- ifelse(grepl(patterns[k], answer), codes[k], coded)
  }
  coded
}

studyKil$Consult1 <- .code_first_match(studyKil$opin01,
                                       c("Never", "Rarely", "Usually", "Always"),
                                       c(0, 1, 2, 3))

studyKil$Consult2 <- .code_first_match(studyKil$opin02,
                                       c("No", "Maybe", "definitely"),
                                       c(0, 1, 2))

studyKil$Consult3 <- .code_first_match(studyKil$opin03,
                                       c("should not", "once or twice", "consistently"),
                                       c(0, 1, 2))

# Candidate A -> 0, Candidate B -> 1, anything else -> NA.
candidate_choice <- studyKil$opin04
studyKil$Candidate <- ifelse(candidate_choice == "Candidate A", 0,
                             ifelse(candidate_choice == "Candidate B", 1, NA))

# 1 for "Yes", 0 for any other recorded answer, NA when missing.
studyKil$HTComment <- ifelse(studyKil$any_concern == "Yes", 1, 0)

studyKil$Volunteer <- ifelse(studyKil$volunteer == "Yes", 1, 0)

# Asked a question in the classroom teacher mtg
# RaisedPlacard: 1 when the respondent's ParentID appears at least once in
# KilCTplacard, 0 otherwise.
RaisedPlacardIDs <- as.vector(KilCTplacard$ParentID) # yes/no raised

studyKil$RaisedPlacard <- ifelse(studyKil$ParentID %in% RaisedPlacardIDs, 1, 0)

# RaisedPlacardNum: how many rows of KilCTplacard carry each ParentID.
RaisedPlacardIDstabs <- setNames(
  as.data.frame(table(KilCTplacard$ParentID)),
  c("ParentID", "RaisedPlacardNum")
)

# Left join onto the survey data. merge() sorts its result on the key by
# default, so after this step the rows of studyKil are ordered by ParentID
# and ParentID is the first column.
studyKil <- merge(studyKil, RaisedPlacardIDstabs,
                  by = "ParentID",
                  all.x = TRUE)

# Respondents absent from the placard file have a count of 0.
studyKil$RaisedPlacardNum <- replace(studyKil$RaisedPlacardNum,
                                     is.na(studyKil$RaisedPlacardNum),
                                     0)

# Met with HT
# HTMeet: 1 when the respondent's ParentID appears in KilHTplacard.
KilHTMeet <- as.vector(KilHTplacard$ParentID) #yes/no met with HT for optional office hours

studyKil$HTMeet <- ifelse(studyKil$ParentID %in% KilHTMeet, 1, 0)

# Subset by treatment group
# Rows are selected with which(): a logical row index that contains NA makes
# `df[cond, ]` return an extra all-NA row for every NA, whereas which() keeps
# only the rows where the condition is TRUE. Respondents with a missing
# ClassID / Poor / Female value therefore appear in none of these subsets.
SOKil <- studyKil[which(studyKil$ClassID == "Survey Only"), ]
IWKil <- studyKil[which(studyKil$ClassID == "Information Workshop"), ]
VPKil <- studyKil[which(studyKil$ClassID == "Validated Participation"), ]

# Subset by asset-based poverty indicator
PoorKil <- studyKil[which(studyKil$Poor == 1), ]
WealthyKil <- studyKil[which(studyKil$Poor == 0), ]

# Subset by gender indicator
WomenKil <- studyKil[which(studyKil$Female == 1), ]
MenKil <- studyKil[which(studyKil$Female == 0), ]

## School and Meeting level data

# Geographic coordinates for schools
# The gps column is split on commas once; the 1st, 2nd and 3rd pieces are
# stored as Latitude, Longitude and Altitude.
gps_parts <- strsplit(as.character(KilSchoolMain$gps), ",")

# Numeric value of the piece at `position` for every school. Single-bracket
# indexing returns NA when a gps string is missing or has too few pieces,
# instead of stopping with a subscript error.
gps_component <- function(position) {
  vapply(gps_parts,
         function(pieces) as.numeric(pieces[position]),
         numeric(1))
}

KilSchoolMain$Latitude <- gps_component(1)

KilSchoolMain$Longitude <- gps_component(2)

KilSchoolMain$Altitude <- gps_component(3)


# Session descriptions
# Keep only the Validated Participation sessions. which() is used so that a
# session with a missing ClassID is dropped rather than turned into an
# all-NA row.
KilVPsessions <- KilSessions[which(KilSessions$ClassID == "Validated Participation"), ]

# The three columns below become factors with the levels in the order listed.
# A recorded answer that is not character-for-character identical to one of
# the listed levels becomes NA.
# NOTE(review): the second parent_discu level contains a typographic
# apostrophe; confirm it matches the text (and encoding) in the CSV.
KilVPsessions$parent_discu <- factor(KilVPsessions$parent_discu,
                            levels=c("Parents were reluctant to speak; there was not much conversation(1)",
                                     "Only one or two parents’ spoke, the rest did not (2)",
                                     "Lots of parents spoke, but they did not seem to listen or respond to each other(3)",
                                     "Lots of parents spoke, and they listened and responded to each other(4)"))

KilVPsessions$speak_most <- factor(KilVPsessions$speak_most,
                            levels=c("More men spoke than women(1)",
                                     "More women spoke than men(2)",
                                     "Women and men spoke up about the same amount(3)",
                                     "No one spoke up(4)"))

KilVPsessions$parent_ask <- factor(KilVPsessions$parent_ask,
                            levels=c("Questions about logistics of the meeting, and compensation(1)",
                                     "Questions about the material presented, and education(2)",
                                     "Questions about the study, and Twaweza(3)",
                                     "They did not ask any questions(4)"))

